import h5py
import warnings
import pandas as pd
import numpy as np
import seaborn as sb
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd
from scipy.stats import skew, kurtosis
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoLarsCV
from statsmodels import regression
from pykalman import KalmanFilter
# DATA CAN BE DOWNLOADED FROM: https://www.dropbox.com/s/dykmvg8j3lzxdcw/data.zip?dl=0

# Import dataset: two HDF5 files, each holding a single 2-D dataset ('X' and 'Y').
x = h5py.File('./X.h5', 'r')
y = h5py.File('./Y.h5', 'r')
# convert to pandas DataFrame
df_x = pd.DataFrame(x['X'][:])
df_y = pd.DataFrame(y['Y'][:])
# rename columns for readability: x1..x56 and y1..y5
df_x.columns = ['x' + str(i + 1) for i in df_x.columns]
df_y.columns = ['y' + str(i + 1) for i in df_y.columns]

# A first look at the data: the last column ('x56') seems to be a 0/1 variable.
df_x.head()
# From raw plots of the 56 X time series, in different time windows, a clustering
# structure of shapes can be clearly identified, made of approximately 6 clusters.
#t1 = 0
#t2 = 1000 #df_x.shape[0]
t2 = df_x.shape[0]
t1 = t2 - 1000  # look at the last 1000 observations
df_x.loc[t1:t2, :].plot(subplots=True, figsize=(15, 5 * 56), legend=False, title=list(df_x.columns))
plt.show()

# The similarity can be confirmed by taking a look at the correlation matrix.
df_x_correlation = df_x.corr()
df_x_correlation

# A heatmap is suitable to spot clusters in the correlation structure.
f, ax = plt.subplots(figsize=(20, 20))
sb.heatmap(df_x_correlation, vmin=-1.0, vmax=1.0, square=True, cmap='bwr')
plt.show()

# As can be seen, there are strong clusters of correlated time series (e.g. x1,...,x22).
# We can employ a clustering approach to group time series by their correlation.
# Moreover, the last two factors, 'x55' and 'x56', are completely uncorrelated from
# the rest: we leave them out as singleton clusters and analyze the clustering
# structure of {'x1',...,'x54'}.
df = df_x.loc[:, :'x54']
df.head()
# We can perform a hierarchical clustering of the X dataset to better understand
# the grouping of the time series in X.
df_corr = df.corr()
# pairwise distances between the correlation profiles of the series
d = sch.distance.pdist(df_corr)
Z = sch.linkage(d, 'average')
plt.figure(figsize=(25, 10))
labelsize = 20
ticksize = 15
plt.title('Hierarchical Clustering Dendrogram for X dataset', fontsize=labelsize)
plt.xlabel('X', fontsize=labelsize)
plt.ylabel('distance', fontsize=labelsize)
sch.dendrogram(
    Z,
    leaf_rotation=90.,   # rotates the x axis labels
    leaf_font_size=8.,   # font size for the x axis labels
    labels=list(df_corr.columns)  # plain list: dendrogram validates labels as a sequence
)
plt.yticks(fontsize=ticksize)
plt.xticks(rotation=-90, fontsize=ticksize)
#plt.savefig('img/dendogram_'+index+'.png')
plt.show()
# We can perform a PCA decomposition to alternatively understand the clusters and
# the 'main shapes' of the different clusters. We could then use the principal
# components as a reduced-X dataset.
def make_PCA(df, num_pc):
    """
    Perform PCA on the 'df' dataset, keeping the first 'num_pc' principal components.

    Arg:
        df: pd.DataFrame (or array-like), shape (n_samples, n_features).
        num_pc: number of principal components to keep.
    Returns:
        - pca: fitted sklearn.decomposition.PCA object
        - percentage: np.array of decreasing fractions of total variance explained.
        - percentage_cum: np.array of cumulative fractions of total variance explained.
    """
    pca = PCA(n_components=num_pc)  # number of principal components
    pca.fit(np.asarray(df))
    percentage = pca.explained_variance_ratio_
    percentage_cum = np.cumsum(percentage)
    print('{0:.2f}% of the variance is explained by the first {1:3d} PCs'.format(percentage_cum[-1] * 100, num_pc))
    return pca, percentage, percentage_cum
def plot_PCA_contributions(df, num_pc=None):
    """
    For dataset 'df', plot the fraction and cumulative fraction of the total
    variance explained by the first 'num_pc' principal components.
    If num_pc is None (default), a full PCA decomposition is performed.

    Returns:
        - pca: sklearn.decomposition.PCA object
        - percentage: np.array of decreasing fractions of total variance explained.
        - percentage_cum: np.array of cumulative fractions of total variance explained.
    """
    if num_pc is None:
        num_pc = df.shape[1]
    pca, percentage, percentage_cum = make_PCA(df, num_pc)
    x = np.arange(1, len(percentage) + 1, 1)
    fig = plt.figure(figsize=(20, 6))
    # left panel: individual contributions
    plt.subplot(1, 2, 1)
    plt.bar(x, percentage * 100, align="center")
    plt.title('Contribution of principal components to total variance', fontsize=16)
    plt.xlabel('Principal components', fontsize=16)
    plt.ylabel("Fraction of total variance (%)", fontsize=16)
    plt.xticks(x, fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlim([0, num_pc + 1])
    # right panel: cumulative contributions
    plt.subplot(1, 2, 2)
    plt.plot(x, percentage_cum * 100, 'ro-')
    plt.xlabel('Principal components', fontsize=16)
    plt.ylabel("Fraction of total variance (%)", fontsize=16)
    plt.title('Cumulative contribution of principal components to total variance', fontsize=16)
    plt.xticks(x, fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlim([1, num_pc])
    return pca, percentage, percentage_cum
# First, let us perform a full PCA decomposition.
_, _, _ = plot_PCA_contributions(df)

# As is evident, 6 components are enough to explain more than 95% of total variance.
# Precisely:
num_pc = 6
pca, _, _ = make_PCA(df, num_pc)

# We can get the 6 'statistical' factors by projecting the original dataset onto the
# 6-dimensional space described by the 6 PCs (eigenvectors of the covariance matrix of X).
df_x_pca = pca.transform(df)
# For clarity: what is implemented by the .transform() method of the PCA class
# can be reproduced manually as the following projection of the (de-meaned) dataset
# onto the pca.components_ eigenvectors:
#
# df_x_pca = (df_x - df_x.mean()).dot(pca.components_.T).values
df_PCx = pd.DataFrame(columns=['PCx' + str(i + 1) for i in range(num_pc)],
                      index=df_x.index,
                      data=df_x_pca)
df_PCx.head()

# We can take a look at those PCs: we observe the 'key' shapes of the series in X.
df_PCx.loc[t1:t2, :].plot(subplots=True, figsize=(15, 5 * 5), legend=False, title=list(df_PCx.columns))
plt.show()
# A first look at the Y dataset clearly shows similarities between each y_i,
# i=1,...,5, and clusters of synchronously sampled X time series.
t1 = 0
t2 = 1000 #df_x.shape[0]
#t2 = df_y.shape[0]
#t1 = t2-1000
df_y.loc[t1:t2, :].plot(subplots=True, figsize=(15, 5 * 6), legend=False, title=list(df_y.columns))
plt.show()

# It is clear that both X and Y series feature structural changes,
# and these seem to be synchronous.
df_x.loc[t1:t2, 'x1'].plot(color='r')
df_y.loc[t1:t2, 'y1'].plot(color='b')
plt.legend()

# Interestingly, the 'x56' spikes seem to be related to the structural
# changes in both X and Y.
df_x.loc[t1:t2, 'x1'].plot(color='r')
df_y.loc[t1:t2, 'y1'].plot(color='b')
df_x.loc[t1:t2, 'x56'].plot(color='k')
plt.legend()

# Let's have a look at the returns (first-difference) time series.
df_x.loc[t1:t2, 'x1'].diff()[1:].plot(color='r')
df_y.loc[t1:t2, 'y1'].diff()[1:].plot(color='b')
#df_x.loc[t1:t2,'x56'].plot(color='k')
plt.legend()
plt.show()

plt.scatter(df_x.loc[t1:t2, 'x1'].diff()[1:], df_y.loc[t1:t2, 'y1'].diff()[1:])  # Plot the raw data
plt.xlabel('X Value')
plt.ylabel('Y Value')
#plt.ylim([-10,10])
#plt.xlim([-10,10])
plt.show()
def splitTrainAndTestSeries(x, y, frac):
    """
    Split the X and Y time series into train and test periods.

    Arg:
        x: pd.DataFrame of X time series
        y: pd.DataFrame of Y time series
        frac: fraction of the total sample to keep as training set (0 < frac <= 1)
    return:
        x_trainHalf, x_testHalf: original X split into train and test periods,
                                 test half is re-indexed to start from 0.
        y_trainHalf, y_testHalf: original Y split into train and test periods,
                                 test half is re-indexed to start from 0.
        T_train, T_test: length of train and test halves
    """
    T = x.shape[0]
    split = int(frac * T)
    # .loc is label-based and right-inclusive: training half is rows 0..split,
    # test half is rows split+1..T-1.
    # .copy() makes the halves independent frames, so re-indexing them below does
    # not mutate a view of the caller's data (avoids SettingWithCopyWarning).
    # X
    x_trainHalf = x.loc[:split, :].copy()
    x_testHalf = x.loc[split + 1:, :].copy()
    # Y
    y_trainHalf = y.loc[:split, :].copy()
    y_testHalf = y.loc[split + 1:, :].copy()
    # train and test halves length
    T_train = x_trainHalf.shape[0]
    T_test = x_testHalf.shape[0]
    # re-indexing of test halves
    x_testHalf.index = range(T_test)
    y_testHalf.index = range(T_test)
    return x_trainHalf, x_testHalf, y_trainHalf, y_testHalf, T_train, T_test
# The effective training set is a window of length w_train at the end of the
# training half. The test set is a window of length w_test at the beginning of
# the test half.
def setTrainAndTestSeries(x, y, frac=0.8, w_train=250, w_test=250, offset_train=0, col_x=None, col_y=None):
    """
    Arg:
        x: pd.DataFrame of X time series
        y: pd.DataFrame of Y time series
        frac: fraction of the total sample to keep as training set (0 < frac <= 1)
        w_train: length of the training set
        w_test: length of the test set
        offset_train: the training window is [T_train - w_train - offset_train : T_train - offset_train].
            If offset_train = 0 (default) the training window is
            [T_train - w_train : T_train], ending in T_train.
        col_x: str or list of X columns to filter (default: None). If None, all columns are returned.
        col_y: str or list of Y columns to filter (default: None). If None, all columns are returned.
    return:
        x_train, x_test: 0-indexed train and test X time series.
        y_train, y_test: 0-indexed train and test Y time series.
    """
    # split original series in train and test periods
    x_trainHalf, x_testHalf, y_trainHalf, y_testHalf, T_train, _ = splitTrainAndTestSeries(x, y, frac)
    # window bounds, hoisted so they are computed once (labels, right-inclusive)
    lo = T_train - w_train - offset_train
    hi = T_train - offset_train - 1
    # .copy() so re-indexing below does not mutate a view (SettingWithCopyWarning)
    if col_x is None:
        x_train = x_trainHalf.loc[lo:hi, :].copy()
        x_test = x_testHalf.loc[:w_test - 1, :]
    else:
        x_train = x_trainHalf.loc[lo:hi, col_x].copy()
        x_test = x_testHalf.loc[:w_test - 1, col_x]
    if col_y is None:
        y_train = y_trainHalf.loc[lo:hi, :].copy()
        y_test = y_testHalf.loc[:w_test - 1, :]
    else:
        y_train = y_trainHalf.loc[lo:hi, col_y].copy()
        y_test = y_testHalf.loc[:w_test - 1, col_y]
    # re-index the train set to start from zero
    x_train.index = range(w_train)
    y_train.index = range(w_train)
    return x_train, x_test, y_train, y_test
# The first model is a linear regression y_t = alpha + beta * PCx_t + eps,
# fitted for each y_i, i=1,...,5, against each of the 6 PCs 'PCx1',...,'PCx6'.
# Denoting by T_train the end of the training period, the regression is run over
# [T_train - w_train : T_train] and the prediction is made over
# [T_train : T_train + w_test] (default w_train = w_test = 250).
def runLinearModel(x_train, x_test, y_train, y_test, IO=False):
    """
    Fit y = alpha + beta * x by OLS on the training window and predict both
    in-sample and out-of-sample.

    Returns:
        linearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
    """
    # Running the linear regression (constant added for the intercept)
    linearModelFit = regression.linear_model.OLS(y_train, sm.add_constant(x_train)).fit()
    # .iloc: explicit positional access into the labelled params Series
    # (plain params[0] is deprecated positional indexing in pandas)
    alpha = linearModelFit.params.iloc[0]
    beta = linearModelFit.params.iloc[1]
    y_train_pred = alpha + beta * x_train
    y_train_pred.name = y_train.name + '_pred [trained]'
    y_test_pred = alpha + beta * x_test
    y_test_pred.name = y_test.name + '_pred [test]'
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
        print(linearModelFit.summary())
    return linearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
def plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, titleString='', plot_x=True, plotTrain=True, plotTest=True):
    """
    Plot actual vs. predicted y (and optionally x, in red) for the train
    and/or test windows.

    Arg:
        titleString: suffix appended to the plot titles.
        plot_x: if True, also plot the predictor series.
        plotTrain / plotTest: toggles for the two panels.
    """
    if plotTrain:
        # in-sample
        if plot_x:
            x_train.plot(color='r')
        y_train.plot(color='b')
        y_train_pred.plot(color='k')
        plt.title('In sample' + titleString)
        plt.legend()
        plt.show()
    if plotTest:
        # out-of-sample
        if plot_x:
            x_test.plot(color='r')
        y_test.plot(color='b')
        y_test_pred.plot(color='k')
        plt.title('Out of sample' + titleString)
        plt.legend()
        plt.show()
    return None
# Results of the linear regression model are quite poor, even if (alpha, beta)
# are usually significant.
results = {}
results['Linear regression'] = {}
results['Linear regression']['Train'] = {}
results['Linear regression']['Test'] = {}
# loop over the y to be predicted
for y_i in df_y.columns:
    results['Linear regression']['Train'][y_i] = {}
    results['Linear regression']['Test'][y_i] = {}
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in df_PCx.columns:
        # retrieve the series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                                 col_x=PCx_j, col_y=y_i)
        # run the linear regression and retrieve predicted y (both in- and out-of-sample)
        linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test,
                                                                                             y_train, y_test,
                                                                                             IO=True)
        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
        results['Linear regression']['Train'][y_i]['LR_' + PCx_j] = RMSE_train
        results['Linear regression']['Test'][y_i]['LR_' + PCx_j] = RMSE_test
trainResultLinearModel = pd.DataFrame(results['Linear regression']['Train'])
testResultLinearModel = pd.DataFrame(results['Linear regression']['Test'])
trainResultLinearModel.name = 'LR'
testResultLinearModel.name = 'LR'
trainResults = trainResultLinearModel
testResults = testResultLinearModel

# To measure the quality of the predictions we look at the in- and out-of-sample
# RMSE:  RMSE = sqrt( (1/w) * sum_{t=0}^{w-1} (y_t - yhat_t)^2 )
# where w = w_train (in-sample) or w = w_test (out-of-sample) and
# yhat_t = alpha + beta * x_t.
ax = trainResults.plot()
ax.set_xticks(range(len(trainResults.index.values) + 1))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()
trainResultLinearModel

ax = testResults.plot()
ax.set_xticks(range(len(testResults.index.values) + 1))
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()
testResultLinearModel
# Considering 'y4' as an example: why does 'PCx4' out-perform 'PCx2' out-of-sample
# (RMSE_test = 0.43 vs 0.73) given their similar in-sample performance
# (RMSE_train = 0.12 vs 0.13)?
# Re-running the analysis for the two cases and looking at the in-/out-of-sample
# correlations: PCx4 keeps a high degree of (anti-)correlation with y4 also in the
# test part (-0.96 and -0.85, respectively), while PCx2 becomes less correlated
# (from -0.96 to -0.37).
# loop over the y to be predicted
for y_i in ['y4']:
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in ['PCx2', 'PCx4']:
        # retrieve the series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                                 col_x=PCx_j, col_y=y_i)
        print('[train] corr(' + PCx_j + ',' + y_i + ') = {}'.format(pd.concat([x_train, y_train], axis=1).corr().iloc[0, 1]))
        print('[test] corr(' + PCx_j + ',' + y_i + ') = {}'.format(pd.concat([x_test, y_test], axis=1).corr().iloc[0, 1]))
        # run the linear regression and retrieve predicted y (both in- and out-of-sample)
        linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test,
                                                                                             y_train, y_test,
                                                                                             IO=True)
        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
# So we see that a change in the correlation regime between the train and test
# periods likely drives a deterioration of the prediction. Intuitively, the
# presence of jumps in the time series makes a model with a single (alpha, beta)
# highly unlikely to be predictive.
# Indeed, we can analyze the effect of jumps on rolling estimates of intercept and
# slope. The following runs the linear model over rolling windows:
# [T_train - w_train - 500 : T_train - 500], ..., [T_train - w_train : T_train]
rolling_w = 500
alpha = []
beta = []
for roll_w in np.arange(rolling_w)[::-1]:
    # retrieve the series for training and prediction
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                             offset_train=roll_w,
                                                             col_x='PCx1', col_y='y1')
    # run the linear regression (no prediction output needed here)
    linearModelFitted, _, _, _, _ = runLinearModel(x_train, x_test,
                                                   y_train, y_test,
                                                   IO=False)
    # rolling intercept and slope values (.iloc: positional access into params)
    alpha.append(linearModelFitted.params.iloc[0])
    beta.append(linearModelFitted.params.iloc[1])
x, _, y, _ = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                   w_train=rolling_w,
                                   offset_train=0,
                                   col_x='PCx1', col_y='y1')
fig, ax1 = plt.subplots(figsize=(9, 6))
idx = x.index
ax1.plot(idx, x.values, 'r-', label='PCx1')
ax1.plot(idx, y.values, 'b-', label='y1')
ax1.tick_params('y', colors='k')
plt.legend()
# second axis for the (much smaller) rolling coefficients
ax2 = ax1.twinx()
ax2.plot(idx, alpha, 'g-', label='alpha')
ax2.plot(idx, beta, 'm-', label='beta')
ax2.tick_params('y', colors='k')
plt.legend()
fig.tight_layout()
plt.show()
# This calls for a model with time-varying (alpha, beta)
# (spoiler: filter (alpha_t, beta_t) as the latent state of a Kalman filter...).
# Another piece of evidence for richer dynamics: increasing the training window,
# w_train = 250 -> 1000, actually increases RMSE_train, making the model even less
# reliable out-of-sample. We test this over the best predictive model so far,
# y5 = alpha + beta * PCx4 + eps, keeping w_test = 250.
results = {}
results['Linear regression'] = {}
results['Linear regression']['Train'] = {}
results['Linear regression']['Test'] = {}
increasing_w_train = np.arange(250, 1010, 10)
# loop over increasing w_train windows
for w in increasing_w_train:
    # loop over the y to be predicted
    for y_i in ['y5']:
        # loop over the single PCs used as predictors in a linear regression model
        for PCx_j in ['PCx4']:
            # retrieve the series for training and prediction
            x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                                     w_train=w,
                                                                     col_x=PCx_j, col_y=y_i)
            # run the linear regression and retrieve predicted y (both in- and out-of-sample)
            linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test,
                                                                                                 y_train, y_test,
                                                                                                 IO=False)
            # plot model predictions (in- and out-of-sample, for comparison)
            #plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
            results['Linear regression']['Train'][w] = RMSE_train
            results['Linear regression']['Test'][w] = RMSE_test
trainResult_Wtrain = pd.Series(results['Linear regression']['Train'], index=increasing_w_train)
testResult_Wtrain = pd.Series(results['Linear regression']['Test'], index=increasing_w_train)
ax = trainResult_Wtrain.plot()
plt.title('RMSE In sample')
plt.xlabel('$w_{train}$')
plt.tight_layout()
plt.show()
ax = testResult_Wtrain.plot()
plt.title('RMSE out sample')
plt.xlabel('$w_{test}$')
plt.tight_layout()
plt.show()
# From these plots it is clear that between w_train = 300 and 400 the in-sample
# RMSE increases abruptly, and at 400 the out-of-sample RMSE increases a lot as
# well. Let's see some plots (250, 300, 350, 400, 450). The problem seems to be
# the number of jumps included/excluded in the window [T_train - w_train : T_train].
increasing_w_train = [250, 300, 350, 400, 450]
# loop over increasing w_train windows
for w in increasing_w_train:
    # loop over the y to be predicted
    for y_i in ['y5']:
        # loop over the single PCs used as predictors in a linear regression model
        for PCx_j in ['PCx4']:
            # retrieve the series for training and prediction
            x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                                     w_train=w,
                                                                     col_x=PCx_j, col_y=y_i)
            # run the linear regression and retrieve predicted y (both in- and out-of-sample)
            linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test,
                                                                                                 y_train, y_test,
                                                                                                 IO=False)
            # .iloc: positional access into the labelled params Series
            print('w_train = {0:2d}, [RMSE_train = {1:.2f}; RMSE_test = {2:.2f}], alpha = {3:.2f}, beta = {4:.2f}'.format(w,
                  RMSE_train, RMSE_test,
                  linearModelFitted.params.iloc[0],
                  linearModelFitted.params.iloc[1]))
            # plot model predictions (in- and out-of-sample, for comparison)
            plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, ' ($w_{train} =' + str(w) + '$)', plotTest=True)
# Before reverting to time-dependent (alpha, beta), check whether all the PCs
# together have better predictive power. We fit the multiple linear regression
# y_t = alpha + sum_{j=1}^{6} beta_j * PCx_{j,t} + eps
# for each y_i, i=1,...,5, over windows [T_train - w_train : T_train] and test
# over [T_train : T_train + w_test] (default w_train = w_test = 250).
def runMultipleLinearRegressionModel(x_train, x_test, y_train, y_test, IO=False):
    """
    Fit y = alpha + sum_j beta_j * x_j by OLS on the training window and predict
    both in-sample and out-of-sample.

    Returns:
        multipleLinearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
    """
    # Running the multiple linear regression (constant added for the intercept)
    multipleLinearModelFit = regression.linear_model.OLS(y_train, sm.add_constant(x_train)).fit()
    # params.iloc[0] is the intercept, params.iloc[1:] the slopes; .mul() aligns
    # the slope Series on the x column labels before the row-wise sum
    y_train_pred = multipleLinearModelFit.params.iloc[0] + x_train.mul(multipleLinearModelFit.params.iloc[1:]).sum(axis=1)
    y_train_pred.name = y_train.name + '_pred [trained]'
    y_test_pred = multipleLinearModelFit.params.iloc[0] + x_test.mul(multipleLinearModelFit.params.iloc[1:]).sum(axis=1)
    y_test_pred.name = y_test.name + '_pred [test]'
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
        print(multipleLinearModelFit.summary())
    return multipleLinearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
results = {}
results['Multiple Linear Regression'] = {}
results['Multiple Linear Regression']['Train'] = {}
results['Multiple Linear Regression']['Test'] = {}
# loop over the y to be predicted
for y_i in df_y.columns:
    results['Multiple Linear Regression']['Train'][y_i] = {}
    results['Multiple Linear Regression']['Test'][y_i] = {}
    # retrieve the series for training and prediction (all 6 PCs as predictors)
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y, col_y=y_i)
    # run the multiple linear regression and retrieve predicted y (both in- and out-of-sample)
    multipleLinearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runMultipleLinearRegressionModel(x_train, x_test,
                                                                                                                   y_train, y_test,
                                                                                                                   IO=True)
    # plot model predictions (in- and out-of-sample, for comparison)
    plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, plot_x=False)
    # scalar RMSEs replace the placeholder dicts created above
    results['Multiple Linear Regression']['Train'][y_i] = RMSE_train
    results['Multiple Linear Regression']['Test'][y_i] = RMSE_test
trainResultMultipleLinearModel = pd.Series(results['Multiple Linear Regression']['Train'])
testResultMultipleLinearModel = pd.Series(results['Multiple Linear Regression']['Test'])
trainResultMultipleLinearModel.name = 'MLR'
testResultMultipleLinearModel.name = 'MLR'
# As expected, despite the in-sample improvement, the MLR model doesn't outperform
# the single linear model in terms of predictive power. Moreover, usually all MLR
# betas are not significant, except the one linked to the best one-factor linear model.
# pd.concat replaces DataFrame.append (removed in pandas 2.0): the named Series is
# transposed into a single-row frame so it is appended as a row labelled 'MLR'.
trainResults = pd.concat([trainResultLinearModel, trainResultMultipleLinearModel.to_frame().T])
testResults = pd.concat([testResultLinearModel, testResultMultipleLinearModel.to_frame().T])
ax = trainResults.plot()
ax.set_xticks(range(len(trainResults.index.values) + 1))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()
trainResults
ax = testResults.plot()
ax.set_xticks(range(len(testResults.index.values)))
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()
testResults
# Before introducing the Kalman filter approach, let's forget for a moment the PC
# time series and get back to the original Xs. As a model-selection approach, the
# LASSO automatically selects the relevant betas, minimizing (N = 70000 here):
#   1/(2N) * ||y - sum_{k=1}^{56} beta_k x_k||_2^2 + lambda * sum_{k=1}^{56} |beta_k|
# over the usual training window. The LassoLarsCV variant automatically
# cross-validates the optimal weight lambda.
def runLassoRegressionModel(x_train, x_test, y_train, y_test, IO=False):
    """
    Fit a LASSO regression (LARS path, 10-fold CV for lambda) on the training
    window and predict both in-sample and out-of-sample.

    Returns:
        lassoModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
    """
    # Running the LASSO regression
    lassoModelFit = LassoLarsCV(cv=10).fit(x_train, y_train)
    y_train_pred = pd.Series(data=lassoModelFit.predict(x_train), index=y_train.index)
    y_train_pred.name = y_train.name + '_pred [trained]'
    y_test_pred = pd.Series(data=lassoModelFit.predict(x_test), index=y_test.index)
    y_test_pred.name = y_test.name + '_pred [test]'
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
    return lassoModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
# silence library warnings emitted during the repeated CV fits
warnings.filterwarnings("ignore")
results = {}
results['Lasso Regression'] = {}
results['Lasso Regression']['Train'] = {}
results['Lasso Regression']['Test'] = {}
# loop over the y to be predicted
for y_i in df_y.columns:
    results['Lasso Regression']['Train'][y_i] = {}
    results['Lasso Regression']['Test'][y_i] = {}
    # retrieve the series for training and prediction (full original X dataset)
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_x, y=df_y, col_y=y_i)
    # run the LASSO regression and retrieve predicted y (both in- and out-of-sample)
    lassoModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLassoRegressionModel(x_train, x_test,
                                                                                                 y_train, y_test,
                                                                                                 IO=True)
    # plot model predictions (in- and out-of-sample, for comparison)
    plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, plot_x=False)
    # scalar RMSEs replace the placeholder dicts created above
    results['Lasso Regression']['Train'][y_i] = RMSE_train
    results['Lasso Regression']['Test'][y_i] = RMSE_test
trainResultLassoModel = pd.Series(results['Lasso Regression']['Train'])
testResultLassoModel = pd.Series(results['Lasso Regression']['Test'])
trainResultLassoModel.name = 'Lasso'
testResultLassoModel.name = 'Lasso'
# Results of the LASSO algorithm are not substantially different from those
# obtained with the MLR, or even with the best single-PCx model for the given y.
# pd.concat replaces DataFrame.append (removed in pandas 2.0): append a 'Lasso' row.
trainResults = pd.concat([trainResults, trainResultLassoModel.to_frame().T])
testResults = pd.concat([testResults, testResultLassoModel.to_frame().T])
ax = trainResults.plot()
ax.set_xticks(range(len(trainResults.index.values) + 1))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()
trainResults
ax = testResults.plot()
ax.set_xticks(range(len(testResults.index.values)))
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()
testResults
# The LASSO algorithm finds it optimal to include 6 Xs as regressors.
# Here is the path of the beta_k when varying the weight lambda
# (plotted up to the optimal lambda ~ 8e-4).
print('lambda={0:.4f}'.format(lassoModelFitted.alpha_))
m_log_lambdas = -np.log10(lassoModelFitted.alphas_)
fig = plt.figure(figsize=(10, 7))
ax = plt.gca()
plt.plot(m_log_lambdas, lassoModelFitted.coef_path_.T)
# raw strings: '\l' is an invalid escape sequence in a plain string literal
plt.axvline(-np.log10(lassoModelFitted.alpha_), linestyle='--', color='k', label=r'$\lambda$ 10-fold cross-validated')
plt.ylabel('Regression Coefficients')
plt.xlabel(r'$-\log(\lambda)$')
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.legend()
plt.show()
# Here are the coefficients beta_k assigned to each x_k, k=1,...,56, and below,
# the corresponding selected x_k in the training set.
fig = plt.figure(figsize=(15, 8))
pd.Series(data=lassoModelFitted.coef_, index=df_x.columns).plot(kind='bar', color='r')
pd.Series(data=0.0 * lassoModelFitted.coef_, index=df_x.columns).plot(color='k')
plt.show()
beta_k = pd.Series(data=lassoModelFitted.coef_, index=df_x.columns)
# regressors actually selected by the LASSO are those with non-zero coefficient;
# the original 'beta_k > 0' silently dropped any regressor with a negative beta
x_k = x_train.loc[:, beta_k[beta_k != 0].index]
x_k.plot(subplots=True, figsize=(7, 5 * 6), legend=False, title=list(x_k.columns), color='r')
plt.show()
We compute time-varying estimates of (alpha, beta) by filtering them as the latent state of a Kalman filter, as follows:
def runKalmanFilterModel(x_train, x_test, y_train, y_test, IO=False, optimized=False, online_prediction=False):
    """
    Fit the time-varying linear model y_t = alpha_t + beta_t * x_t with a Kalman
    filter whose 2-dimensional latent state is (beta, alpha).

    Arg:
        x_train, x_test: pd.Series of the predictor over the train/test windows.
        y_train, y_test: pd.Series of the target over the train/test windows.
        IO: if True, print in- and out-of-sample RMSE.
        optimized: if True, calibrate covariances and initial state via one EM
            iteration; otherwise use fixed hand-picked noise settings.
        online_prediction: if True, keep updating the state over the test window
            using a noisy guess of y_t (a lagged moving average of observed y);
            otherwise freeze the last filtered (beta, alpha) from training.
    Returns:
        kf, y_train_pred, y_test_pred, RMSE_train, RMSE_test, alpha, beta
        where alpha/beta are the filtered state paths over the training window.
    """
    # define the observation matrix when (beta, alpha) is the state vector:
    # one row [x_t, 1] per training observation
    obs_mat = np.expand_dims(np.vstack([[x_train], [np.ones(len(x_train))]]).T, axis=1)
    if optimized:
        kf = KalmanFilter(n_dim_obs=1, n_dim_state=2, # y is 1-dimensional, (alpha, beta) is 2-dimensional
                          transition_matrices=np.eye(2),
                          observation_matrices=obs_mat)
        # EM algorithm iterations to optimize the covariances and initial guesses
        # (the number of iterations is kept to 1 to avoid, as much as possible,
        # overfitting of the training set)
        kf.em(y_train.values, n_iter=1, em_vars=['transition_covariance',
                                                 'observation_covariance',
                                                 'initial_state_mean',
                                                 'initial_state_covariance'])
    else:
        # define the noises in observations
        delta = 1e-3
        trans_cov = delta / (1 - delta) * np.eye(2) # How much random walk wiggles
        kf = KalmanFilter(n_dim_obs=1, n_dim_state=2, # y is 1-dimensional, (alpha, beta) is 2-dimensional
                          initial_state_mean=[0,0],
                          initial_state_covariance=np.ones((2, 2)),
                          transition_matrices=np.eye(2),
                          observation_matrices=obs_mat,
                          observation_covariance=1.0,
                          transition_covariance=trans_cov)
    # Use the observations y to get running estimates and errors for the state parameters
    state_means, state_covs = kf.filter(y_train.values)
    # filtered (beta, alpha) state: column 0 is beta, column 1 is alpha
    alpha = state_means[:,1]
    beta = state_means[:,0]
    y_train_pred = alpha + beta*x_train
    y_train_pred.name = y_train.name + '_pred [trained]'
    if online_prediction:
        y_test_pred = pd.Series(index=y_test.index)
        # initialize the state to be updated to the last filtered in training
        estimated_state_mean, estimated_state_covs = state_means[-1], state_covs[-1]
        # loop over the test set - BUT I'M USING ONLY THE X!
        for t in x_test.index:
            # we make a NOISY guess y_{t} | info up to t-1, given:
            # the estimate (beta, alpha)_{t-1}
            # x_t
            lag=5
            if t < lag:
                if t == 0:
                    # bootstrap the guess from the last 'lag' training observations
                    y_t_guess = y_train.tail(lag).values.mean()
                else:
                    pass # y_t_guess not updated in the first lag steps
            else:
                # trailing moving average of the observed test y
                y_t_guess = y_test[t-lag:t].mean() # Note well: y_t is not included in the averaging
            #y_t_guess = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t] # exact KF forecast, tauthologic
            #y_t_guess = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t] + np.random.randn() # with same observation noise
            #print('y_t_guess = ', y_t_guess)
            #print('alpha_old = ', estimated_state_mean[1])
            #print('beta_old = ', estimated_state_mean[0])
            # Compute the estimate (beta, alpha)_t of the state for time t, given:
            # the estimate (beta, alpha)_{t-1}
            # the guess: y_{t} | info up to t-1
            # x_t
            estimated_state_mean, estimated_state_covs = kf.filter_update(filtered_state_mean=estimated_state_mean,
                                                                          filtered_state_covariance=estimated_state_covs,
                                                                          observation=y_t_guess,
                                                                          observation_matrix=np.array([[x_test[t],1.0]]))
            #print('alpha_new = ', estimated_state_mean[1])
            #print('beta_new = ', estimated_state_mean[0])
            # given x_t and the estimate (beta, alpha)_t just computed,
            # make the prediction for y_t
            y_test_pred[t] = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t]
            #print('y_test_pred = ', y_test_pred[t])
    else:
        # for out-of-sample prediction use the last filtered value of the (beta, alpha) state
        y_test_pred = alpha[-1] + beta[-1]*x_test
    y_test_pred.name = y_test.name + '_pred [test]'
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
    return kf, y_train_pred, y_test_pred, RMSE_train, RMSE_test, alpha, beta
# RMSE bookkeeping for the plain (non-optimized, offline) Kalman-filter model.
results = {}
results['Kalman Filter'] = {'Train': {}, 'Test': {}}
# fit one Kalman-filter regression per (target y, principal-component x) pair
for y_i in df_y.columns:
    results['Kalman Filter']['Train'][y_i] = {}
    results['Kalman Filter']['Test'][y_i] = {}
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in df_PCx.columns:
        # retrieve the series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=PCx_j, col_y=y_i)
        # run the Kalman-filter regression and retrieve the predicted y
        # (both in- and out-of-sample)
        (KFModelFitted, y_train_pred, y_test_pred,
         RMSE_train, RMSE_test, _, _) = runKalmanFilterModel(
            x_train, x_test, y_train, y_test, IO=True)
        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
        results['Kalman Filter']['Train'][y_i]['KF_' + PCx_j] = RMSE_train
        results['Kalman Filter']['Test'][y_i]['KF_' + PCx_j] = RMSE_test
# Collect the per-target RMSEs of the plain Kalman-filter models into DataFrames
# (index = 'KF_<PC>' model label, columns = target series) and stack them onto
# the running comparison tables.
trainResultKFModel = pd.DataFrame(results['Kalman Filter']['Train'])
testResultKFModel = pd.DataFrame(results['Kalman Filter']['Test'])
trainResultKFModel.name = 'KF'
testResultKFModel.name = 'KF'
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack result tables.
trainResults = pd.concat([trainResults, trainResultKFModel])
testResults = pd.concat([testResults, testResultKFModel])
# In-sample RMSE comparison across all models collected so far.
ax = trainResults.plot(figsize=(10, 6))
# BUG FIX: the tick count must match the label count; the original used
# range(len(index) + 1), leaving one extra, unlabeled tick.
ax.set_xticks(range(len(trainResults.index)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()
trainResults
# Out-of-sample RMSE comparison across all models collected so far
# (rows = model label, columns = target series).
ax = testResults.plot(figsize=(10, 6))
ax.set_xticks(range(testResults.index.size))
ax.set_xticklabels(testResults.index.values)
ax.set_title('RMSE Out of sample')
plt.tight_layout()
plt.show()
testResults
Kalman Filter with EM-optimized parameters
This model is clearly overfitting the training set, even though the number of iterations of the EM algorithm is just 1.
# RMSE bookkeeping for the Kalman-filter model with EM-optimized parameters.
results = {}
results['Kalman Filter Optimized'] = {'Train': {}, 'Test': {}}
# fit one optimized Kalman-filter regression per (target y, principal-component x) pair
for y_i in df_y.columns:
    results['Kalman Filter Optimized']['Train'][y_i] = {}
    results['Kalman Filter Optimized']['Test'][y_i] = {}
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in df_PCx.columns:
        # retrieve the series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=PCx_j, col_y=y_i)
        # run the EM-optimized Kalman-filter regression and retrieve the
        # predicted y (both in- and out-of-sample)
        (KFModelFitted, y_train_pred, y_test_pred,
         RMSE_train, RMSE_test, _, _) = runKalmanFilterModel(
            x_train, x_test, y_train, y_test, IO=True, optimized=True)
        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
        model_key = 'KF_Opt_' + PCx_j
        results['Kalman Filter Optimized']['Train'][y_i][model_key] = RMSE_train
        results['Kalman Filter Optimized']['Test'][y_i][model_key] = RMSE_test
# Collect the per-target RMSEs of the EM-optimized Kalman-filter models
# (index = 'KF_Opt_<PC>' model label, columns = target series) and stack them
# onto the running comparison tables.
trainResultKF_Opt_Model = pd.DataFrame(results['Kalman Filter Optimized']['Train'])
testResultKF_Opt_Model = pd.DataFrame(results['Kalman Filter Optimized']['Test'])
trainResultKF_Opt_Model.name = 'KF_Opt'
testResultKF_Opt_Model.name = 'KF_Opt'
# BUG FIX: DataFrame.append was removed in pandas 2.0 -> use pd.concat.
trainResults = pd.concat([trainResults, trainResultKF_Opt_Model])
testResults = pd.concat([testResults, testResultKF_Opt_Model])
# For each target y, report which model achieves the lowest in-sample RMSE.
print('Best model in-sample by y:')
print(trainResults.idxmin())
ax = trainResults.plot(figsize=(15, 6))
# BUG FIX: one tick per label (the original had a spurious len+1 tick).
ax.set_xticks(range(len(trainResults.index)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()
trainResults
# For each target y, report which model achieves the lowest out-of-sample RMSE,
# then plot the out-of-sample RMSE comparison across all models collected so far.
print('Best model out-of-sample by y:')
print(testResults.idxmin())
ax = testResults.plot(figsize=(15, 6))
ax.set_xticks(range(testResults.index.size))
ax.set_xticklabels(testResults.index.values)
ax.set_title('RMSE Out of sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()
testResults
Online Kalman Filter: the state (beta, alpha) is updated step by step over the test set, using x(t) together with a lagged moving-average guess of y(t), and the updated state is then used to predict y(t).
# RMSE bookkeeping for the online-updated Kalman-filter model.
results = {}
results['Kalman Filter Online'] = {'Train': {}, 'Test': {}}
# fit one online Kalman-filter regression per (target y, principal-component x) pair
for y_i in df_y.columns:
    results['Kalman Filter Online']['Train'][y_i] = {}
    results['Kalman Filter Online']['Test'][y_i] = {}
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in df_PCx.columns:
        # retrieve the series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=PCx_j, col_y=y_i)
        # run the Kalman-filter regression with online state updates over the
        # test set and retrieve the predicted y (both in- and out-of-sample)
        (KFModelFitted, y_train_pred, y_test_pred,
         RMSE_train, RMSE_test, _, _) = runKalmanFilterModel(
            x_train, x_test, y_train, y_test, IO=True, online_prediction=True)
        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
        model_key = 'KF_Online_' + PCx_j
        results['Kalman Filter Online']['Train'][y_i][model_key] = RMSE_train
        results['Kalman Filter Online']['Test'][y_i][model_key] = RMSE_test
# Collect the per-target RMSEs of the online Kalman-filter models
# (index = 'KF_Online_<PC>' model label, columns = target series) and stack
# them onto the running comparison tables.
trainResultKF_Online_Model = pd.DataFrame(results['Kalman Filter Online']['Train'])
testResultKF_Online_Model = pd.DataFrame(results['Kalman Filter Online']['Test'])
trainResultKF_Online_Model.name = 'KF_Online'
testResultKF_Online_Model.name = 'KF_Online'
# BUG FIX: DataFrame.append was removed in pandas 2.0 -> use pd.concat.
trainResults = pd.concat([trainResults, trainResultKF_Online_Model])
testResults = pd.concat([testResults, testResultKF_Online_Model])
# For each target y, report which model achieves the lowest in-sample RMSE.
print('Best model in-sample by y:')
print(trainResults.idxmin())
ax = trainResults.plot(figsize=(15, 6))
# BUG FIX: one tick per label (the original had a spurious len+1 tick).
ax.set_xticks(range(len(trainResults.index)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()
trainResults
# Out-of-sample leaderboard: best model per target, then the RMSE comparison plot.
print('Best model out-of-sample by y:')
print(testResults.idxmin())
ax = testResults.plot(figsize=(15, 6))
ax.set_xticks(range(testResults.index.size))
ax.set_xticklabels(testResults.index.values)
ax.set_title('RMSE Out of sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()
testResults
# Ad-hoc notebook scratch cells: sanity checks of the lagged-average guess
# used in the online prediction loop, on the leftover y_test/y_train series.
y_test[10]
# NOTE(review): this slice ends at 11, so with label-based indexing it may
# include y_test[10] itself -- unlike the y_test[t-lag:t] slice in the loop;
# verify which convention was intended.
y_test[10-5:11].mean()
# mean of the last `lag`(=5) training observations (the t == 0 fallback guess)
y_train.tail(5).values.mean()